import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge,LassoCV
# Load the King County house-sales dataset and take a first look at it.
House_data = pd.read_csv("../data/kc_house_data.csv",engine='python')
House_data.shape
House_data.head(5)
House_data.info()
House_data.describe()
# Frequency counts for the main categorical / ordinal features.
House_data.bedrooms.value_counts()
House_data.floors.value_counts()
House_data.view.value_counts()
House_data.waterfront.value_counts()
House_data.condition.value_counts()
House_data.grade.value_counts()
# Reduce the sale date to a monthly period. Use upper-case 'M' (month):
# lower-case 'm' is deprecated/ambiguous as a Period frequency in recent pandas.
House_data['date'] = pd.to_datetime(House_data['date']).dt.to_period('M')
House_data.head()
# Confirm whether any column contains nulls before modelling.
House_data.isnull().any()
There are no null or missing values in the dataset, so no data cleansing is required.
# Count duplicate rows (sanity check before plotting).
House_data.duplicated().sum()

# One boxplot per feature column (skipping id/date) to spot outliers.
# NOTE(fix): the loop body had lost its indentation in the original and
# was not valid Python; restored here.
for i in House_data.iloc[:, 2:].columns:
    House_data.iloc[:, 1:].boxplot(column=i)
    plt.show()

# Distribution of the number of bedrooms among sold houses.
House_data['bedrooms'].value_counts().plot(kind='bar')
plt.title('number of Bedrooms')
plt.xlabel('Bedrooms')
plt.ylabel('Count')
# NOTE(fix): was `sns.despine` without parentheses, a no-op attribute access.
sns.despine()
The most common type of house sold is the three-bedroom house, followed by four-bedroom houses. This tells us that three-bedroom houses are in the highest demand, so the number of bedrooms is likely an important factor when setting the price.
# Scatter of house locations (latitude vs. longitude) with marginal
# distributions. jointplot creates its own figure, so the original
# plt.figure(figsize=(10,10)) call only produced a stray empty figure
# and has been removed.
sns.jointplot(x=House_data.lat.values, y=House_data.long.values, height=10)
plt.ylabel('Longitude', fontsize=12)
plt.xlabel('Latitude', fontsize=12)
plt.show()
# NOTE(fix): was `sns.despine` without parentheses, a no-op attribute access.
sns.despine()
We see a large number of houses between latitudes 47.5 and 47.8 and, in terms of longitude, between -122.2 and -122.4. This area may be the most desirable location to live, so house prices are likely to vary with latitude and longitude.
# Pairwise relationships between all attributes (KDE on the diagonal).
HData_attr = House_data.iloc[:, 0:21]
sns.pairplot(HData_attr, diag_kind='kde')

# Pearson correlation matrix over the numeric columns.
# NOTE(fix): numeric_only=True is required in pandas >= 2.0 because the
# 'date' column holds (non-numeric) Period values and .corr() no longer
# drops non-numeric columns silently.
house_corr = House_data.corr(method='pearson', numeric_only=True)
house_corr

# Heatmap of correlations computed over label-encoded (factorized)
# columns, so every column — including 'date' — contributes.
plt.figure(figsize=(15,15))
corr = House_data.apply(lambda x: pd.factorize(x)[0]).corr()
ax = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns,
                 linewidths=.2, cmap="YlGnBu")
House_data.columns
# Feature matrix: drop the target plus identifier / low-value columns.
X = House_data.drop(['price','id','date', 'zipcode', 'yr_renovated'],axis=1)
# Keep the target as a one-column frame (so fitted coefficients are 2-D).
y = House_data[['price']]

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=110)

# Ordinary least-squares baseline.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

# coef_ has shape (1, n_features) because y is 2-D; walk features and
# coefficients together.
for col_name, coefficient in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coefficient))

intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
# NOTE(fix): `normalize=True` was deprecated in scikit-learn 1.0 and
# removed in 1.2; the documented migration is to standardize the
# features in a Pipeline before the estimator.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

ridge = make_pipeline(StandardScaler(), Ridge(alpha=.3))
ridge.fit(X_train,y_train)
# Coefficients live on the final (Ridge) step of the pipeline.
print ("Ridge model:", (ridge[-1].coef_))
from sklearn.metrics import r2_score, get_scorer
from sklearn.linear_model import Lasso, Ridge, LassoCV,LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold, RepeatedKFold, GridSearchCV, cross_validate, train_test_split

# Tune the Lasso regularization strength with repeated 10-fold CV.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Candidate alphas. NOTE(fix): start strictly above zero — Lasso with
# alpha=0 is not a supported configuration (scikit-learn warns to use
# LinearRegression instead, and coordinate descent may not converge).
lasso_alphas = np.linspace(0.01, 0.2, 20)

lasso = Lasso()
grid = dict()
grid['alpha'] = lasso_alphas
gscv = GridSearchCV(lasso,
                    grid,
                    scoring='neg_mean_absolute_error',
                    cv=cv,
                    n_jobs=-1)
results = gscv.fit(X_train, y_train)
# best_score_ is a *negative* MAE (higher = better under sklearn's convention).
print('MAE: %.5f' % results.best_score_)
print('Config: %s' % results.best_params_)
# NOTE(fix): `normalize=True` was deprecated in scikit-learn 1.0 and
# removed in 1.2; standardize the features in a Pipeline instead.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

lasso = make_pipeline(StandardScaler(), Lasso(alpha=0.2))
lasso.fit(X_train,y_train)
# Coefficients live on the final (Lasso) step of the pipeline.
print ("Lasso model:", (lasso[-1].coef_))
# R^2 on the train and test split for each fitted model, printed in the
# same order and format as before: linear, ridge, lasso — one bare
# score per line, train first then test.
for fitted in (regression_model, ridge, lasso):
    print(fitted.score(X_train, y_train))
    print(fitted.score(X_test, y_test))
There is no considerable increase in the R-squared value when Ridge or Lasso regression is used instead of linear regression.
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

# Baseline decision tree (default hyperparameters, fully grown).
dtree_up = DecisionTreeRegressor()
dtree_up.fit(X_train, y_train) # Fitting model with x_train and y_train
dtree_pred_up = dtree_up.predict(X_test) # Predicting the results

# NOTE(fix): the original applied np.sqrt to mean_squared_error(...,
# squared=False), which already returns the RMSE — i.e. it printed
# sqrt(RMSE). Take the square root of the MSE exactly once.
print('RMSE:', np.sqrt(mean_squared_error(y_test, dtree_pred_up)))
print('r2 score: %.2f' % r2_score(y_test, dtree_pred_up))
# For a regressor, .score() is R^2 (not classification accuracy).
print("Accuracy :",dtree_up.score(X_test, y_test))
# Tune the tree depth with a 5-fold grid search over max_depth 1..20.
dtree = DecisionTreeRegressor(random_state=5)
d=np.arange(1,21,1)
hyperParam = [{'max_depth':d}]
gsv = GridSearchCV(dtree,hyperParam,cv=5,verbose=1)
best_model = gsv.fit(X_train, y_train) # Fitting model with xtrain_scaler and y_train
dtree_pred_mms = best_model.best_estimator_.predict(X_test) # Predicting the results
print("Best HyperParameter: ",gsv.best_params_)
# NOTE(fix): the original computed sqrt(RMSE) by combining np.sqrt with
# squared=False; take the square root of the MSE exactly once.
print('RMSE:', np.sqrt(mean_squared_error(y_test, dtree_pred_mms)))
print('r2 score: %.2f' % r2_score(y_test, dtree_pred_mms))
# .score() on a regressor grid search reports R^2 of the best estimator.
print("Accuracy :",best_model.score(X_test, y_test))
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Baseline random forest with default hyperparameters.
rf = RandomForestRegressor()
rf.fit(X_train, y_train) # Fitting model with x_train and y_train
rf_pred = rf.predict(X_test) # Predicting the results
# NOTE(fix): the original computed sqrt(RMSE) by combining np.sqrt with
# squared=False; take the square root of the MSE exactly once.
print('RMSE:', np.sqrt(mean_squared_error(y_test, rf_pred)))
print('r2 score: %.2f' % r2_score(y_test, rf_pred))
# For a regressor, .score() is R^2 (not classification accuracy).
print("Accuracy :",rf.score(X_test, y_test))
# Grid-search n_estimators and max_depth for the random forest.
nEstimator = [140,160,180,200,220]
depth = [10,15,20,25,30]
RF = RandomForestRegressor()
hyperParam = [{'n_estimators':nEstimator,'max_depth': depth}]
gsv = GridSearchCV(RF,hyperParam,cv=5,verbose=1,scoring='r2',n_jobs=-1)
gsv.fit(X_train, y_train)
print("Best HyperParameter: ",gsv.best_params_)
# NOTE(fix): cv_results_ enumerates the grid with parameter names in
# sorted order ('max_depth' outer, 'n_estimators' inner/fastest), so
# max_depth must be the first reshape axis. The original order was
# transposed — unnoticed because both lists happen to have length 5.
scores = gsv.cv_results_['mean_test_score'].reshape(len(depth),len(nEstimator))

# Refit a forest with the best hyperparameters and evaluate on the test set.
maxDepth=gsv.best_params_['max_depth']
nEstimators=gsv.best_params_['n_estimators']
model = RandomForestRegressor(n_estimators = nEstimators,max_depth=maxDepth)
model.fit(X_train, y_train) # Fitting model with x_train and y_train
# Predicting the results:
rf_pred_tune = model.predict(X_test)
# NOTE(fix): sqrt applied once (the original printed sqrt(RMSE)).
print('RMSE:', np.sqrt(mean_squared_error(y_test, rf_pred_tune)))
print('r2 score: %.2f' % r2_score(y_test, rf_pred_tune))
print("Accuracy :",model.score(X_test, y_test))
from sklearn import ensemble

# Gradient-boosted regression trees.
# NOTE(fix): loss='ls' was renamed to 'squared_error' in scikit-learn
# 1.0 and removed in 1.2.
clf = ensemble.GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1, loss = 'squared_error')
# NOTE(fix): removed the IPython-only help lookups
# (`ensemble.GradientBoostingRegressor?` and `clf.score?`) — the
# trailing `?` is not valid Python outside a notebook.
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
y_predict
clf.score(X_test,y_test)
# Reuse the predictions already computed instead of calling predict() again.
gb_pred = y_predict
# NOTE(fix): sqrt applied once (the original printed sqrt(RMSE)).
print('RMSE:', np.sqrt(mean_squared_error(y_test, gb_pred)))
print('r2 score: %.2f' % r2_score(y_test, gb_pred))
# For a regressor, .score() is R^2 (not classification accuracy).
print("Accuracy :",clf.score(X_test, y_test))